In other words, the goals are to:
# The first line sets an option for the final document that can be produced from
# the .Rmd file. Don't worry about it.
# Global knitr chunk options: echo the code, use the full text width for
# figures, and cap printed chunk output at 200px (scrollable box in html).
knitr::opts_chunk$set(echo = TRUE,
out.width="\\textwidth", # for larger figures
attr.output = 'style="max-height: 200px;"' # scrollable output boxes
)
# The next bit is quite powerful and useful.
# First you define which packages you need for your analysis and assign it to
# the p_needed object.
p_needed <-
c("viridis", "MASS", "sandwich", "lmtest",
"ggplot2", "patchwork", "dplyr")
# Now you check which packages are already installed on your computer.
# The function installed.packages() returns a vector with all the installed
# packages.
packages <- rownames(installed.packages())
# Then you check which of the packages you need are not installed on your
# computer yet. Essentially you compare the vector p_needed with the vector
# packages. The result of this comparison is assigned to p_to_install.
p_to_install <- p_needed[!(p_needed %in% packages)]
# If at least one element is in p_to_install you then install those missing
# packages.
if (length(p_to_install) > 0) {
install.packages(p_to_install)
}
# Now that all packages are installed on the computer, you can load them for
# this project. Additionally the expression returns whether the packages were
# successfully loaded.
# NOTE(review): require() (unlike library()) returns FALSE instead of erroring
# when a package fails to load; that is intentional here, since sapply() turns
# those return values into the per-package TRUE/FALSE report shown below.
sapply(p_needed, require, character.only = TRUE)
## viridis MASS sandwich lmtest ggplot2 patchwork dplyr
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# This is an option for stargazer tables
# It automatically adapts the output to html or latex,
# depending on whether we want a html or pdf file
stargazer_opt <- ifelse(knitr::is_latex_output(), "latex", "html")
# only relevant for ggplot2 plotting
# setting a global ggplot theme for the entire document to avoid
# setting this individually for each plot
# (some plots below call theme_classic() again to restore the defaults,
# e.g. where y-axis text is needed)
theme_set(theme_classic() + # start with classic theme
theme(
plot.background = element_blank(),# remove all background
plot.title.position = "plot", # move the plot title start slightly
legend.position = "bottom", # by default, put legend on the bottom
axis.line.y = element_blank(), # remove line on y axis
axis.ticks.y = element_blank(), # remove ticks on y axis
axis.text.y = element_blank() # remove text on y axis
))
# Load the 2013 election district data; the .RData file creates the object
# election2013_2 in the workspace (expects raw-data/ in the working directory).
load(file = "raw-data/election2013_2.RData")
df <- as.data.frame(election2013_2)
We regressed leftvote on unemployment and
east. Additionally, we include a multiplicative interaction
term unemployment*east.
# Linear model: Left party vote share on unemployment, an East-Germany
# indicator, and their interaction.
# NOTE(review): unemployment*east already expands to the two main effects plus
# unemployment:east, so listing them separately is redundant; lm() drops the
# duplicated terms and the fit is unchanged.
reg <- lm(leftvote ~ unemployment + east +
unemployment*east,
data = df)
summary(reg)
##
## Call:
## lm(formula = leftvote ~ unemployment + east + unemployment *
## east, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8438 -0.8513 -0.1683 0.5337 11.4562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.26144 0.30817 7.338 2.11e-12 ***
## unemployment 0.54859 0.04739 11.576 < 2e-16 ***
## east 16.10218 1.43807 11.197 < 2e-16 ***
## unemployment:east -0.13650 0.14407 -0.947 0.344
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.874 on 295 degrees of freedom
## Multiple R-squared: 0.9295, Adjusted R-squared: 0.9288
## F-statistic: 1297 on 3 and 295 DF, p-value: < 2.2e-16
We also coded a helpful function
# Summarise a numeric vector as its 95% interval bounds plus its mean.
# Returns a named numeric vector: quants.2.5%, quants.97.5%, mean.
quants_mean_fun <- function(x) {
  interval_bounds <- quantile(x, probs = c(0.025, 0.975))
  c(quants = interval_bounds, mean = mean(x))
}
And here are all the simulation steps for expected values:
# Step 1 - Get the regression coefficients
beta_hat <- coef(reg)
# Step 2.1. Get the variance-covariance matrix
V_hat <- vcov(reg)
# Step 2.2. Draw from the multivariate normal distribution.
# (mvrnorm() comes from MASS, loaded in the setup chunk.)
nsim <- 1000
S <- mvrnorm(nsim, beta_hat, V_hat)
# Step 3 - Choose interesting covariate values.
unemp <- seq(0, 20, 0.1) # Range from 0 to 20, in steps of 0.1
# Column order must match coef(reg): intercept, unemployment, east, interaction.
scenario_east <- cbind(1, unemp, 1, unemp * 1)
scenario_west <- cbind(1, unemp, 0, unemp * 0)
# Step 4 - Calculate Quantities of Interest
# Each result is an nsim x length(unemp) matrix of expected values.
EV_range_east <- S %*% t(scenario_east)
EV_range_west <- S %*% t(scenario_west)
# Quantiles, we again use apply and our quants.mean.fun
# (column-wise: one 2.5%/97.5%/mean triple per unemployment value)
quants_range_east <- apply(EV_range_east, 2, quants_mean_fun)
quants_range_west <- apply(EV_range_west, 2, quants_mean_fun)
Visualize the results:
# Empty canvas (type = "n"): only axes and labels; content is layered below.
plot(
unemp,
quants_range_east[2,],
pch = 19,
cex = 0.3,
bty = "n",
las = 1,
ylim = c(0, 35),
ylab = "Voteshare (%)",
main = "Expected Voteshare (Die Linke)",
xlab = "Range of Unemployment",
type = "n"
)
# Observed district-level data points.
points(df$unemployment,
df$leftvote,
pch = 19,
col = adjustcolor(viridis(3)[1], alpha = 0.8))
# 95% confidence band, East: lower bound left-to-right, upper bound reversed.
polygon(
x = c(unemp, rev(unemp)),
y = c(quants_range_east[1 ,], rev(quants_range_east[2 ,])),
col = adjustcolor(viridis(3)[2], alpha = 0.5),
border = NA
)
# 95% confidence band, West.
polygon(
x = c(unemp, rev(unemp)),
y = c(quants_range_west[1 ,], rev(quants_range_west[2 ,])),
col = adjustcolor(viridis(3)[3], alpha = 0.5),
border = NA
)
# Mean expected values (row 3 of the quants_mean_fun output).
lines(unemp, quants_range_east[3,], col = viridis(3)[2]) # In this case I usually plot the polygons first and then the lines.
lines(unemp, quants_range_west[3,],
col = viridis(3)[3])
# Add a legend
legend("topleft",
lty = "solid",
col = viridis(3)[2:3],
legend = c("East", "West"),
cex = 0.8,
bty = "n")
Now we also want to simulate predicted values. What’s different from expected values?
# Scenario vectors at mean unemployment; order matches coef(reg):
# intercept, unemployment, east, unemployment:east.
X_east <- c(1, mean(df$unemployment), 1, mean(df$unemployment) * 1) # East
X_west <- c(1, mean(df$unemployment), 0, mean(df$unemployment) * 0) # West
X <- as.matrix(rbind(X_east, X_west))
This is still the same as above.
# nsim x 2 matrix: column 1 = East scenario, column 2 = West scenario.
EV_combined <- S %*% t(X)
Now we need to add something. Remember sigma_est
(i.e. \(\hat\sigma\)) from last
lab/lecture? That’s the fundamental uncertainty!
# Y ~ N(mu_c, sigma_est)
# Residual SD estimate: sqrt(SSR / (n - k)), with k = number of coefficients.
sigma_est <- sqrt(sum(residuals(reg)^2) / (nrow(df) - length(beta_hat)))
# Predicted values = expected values + one fresh error draw per simulation.
Y_hat_east <- EV_combined[,1] + rnorm(nsim, 0, sigma_est)
Y_hat_west <- EV_combined[,2] + rnorm(nsim, 0, sigma_est)
# Quantiles
quants_east <- quants_mean_fun(Y_hat_east)
quants_west <- quants_mean_fun(Y_hat_west)
Let’s plot it:
# Histogram Predicted Values West Germany
hist(Y_hat_west,
las = 1,
main = "Histogram of Predicted Values (West Germany)",
col = viridis(3)[1],
border = "white")
# dashed lines at the 2.5%, 97.5% quantiles and the mean
abline(v = c(quants_west[1:3]), lty = 2, col = viridis(3)[3])
# Histogram Predicted Values East Germany
hist(Y_hat_east,
las = 1,
main = "Histogram of Predicted Values (East Germany)",
col = viridis(3)[2],
border = "white")
abline(v = c(quants_east[1:3]), lty = 2, col = viridis(3)[3])
# We could put both distributions in one plot.
# type = "n" sets up the canvas only; densities are added with lines() below.
plot(density(Y_hat_west),
xlim = c(0,40),
lwd = 2 ,
bty = "n",
yaxt = "n",
ylab = "",
xlab = "Left Voteshare in %",
main = "Predicted Values for Voteshare",
type = "n")
# density estimates restricted to the observed range of each vector
lines(density(Y_hat_west, from = min(Y_hat_west), to = max(Y_hat_west)), lwd = 2, col = viridis(3)[1])
lines(density(Y_hat_east, from = min(Y_hat_east), to = max(Y_hat_east)), lwd = 2, col = viridis(3)[2])
abline(v = c(quants_west[1:3]), lty = 2, col = viridis(3)[3])
abline(v = c(quants_east[1:3]), lty = 2, col = viridis(3)[3])
Let’s also do it for a range of unemployment.
# Expected values over the unemployment range (same as above).
EV_range_east <- S %*% t(scenario_east)
EV_range_west <- S %*% t(scenario_west)
# Add fundamental uncertainty.
# NOTE(review): rnorm(nsim) is recycled column-wise over the
# nsim x length(unemp) matrix, so each simulation row re-uses a single error
# draw for every scenario value; independent draws per cell would require
# matrix(rnorm(nsim * length(unemp)), nrow = nsim) — confirm which is intended.
Y_hat_range_east <- EV_range_east + rnorm(nsim, 0, sigma_est)
Y_hat_range_west <- EV_range_west + rnorm(nsim, 0, sigma_est)
# Quantiles, we again use apply and our quants_mean_fun
Y_quants_range_east <- apply(Y_hat_range_east, 2, quants_mean_fun)
Y_quants_range_west <- apply(Y_hat_range_west, 2, quants_mean_fun)
Plot it with polygons as confidence intervals.
# Empty canvas for the predicted-values plot; wider ylim than the EV plot
# because predicted values carry fundamental uncertainty.
plot(
unemp,
Y_quants_range_east[2, ],
las = 1,
bty = "n",
pch = 19,
cex = 0.3,
ylim = c(0, 45),
ylab = "Voteshare (%)",
main = "Predicted Voteshare (Die Linke)",
xlab = "Range of Unemployment",
type = "n"
)
# Observed data points.
points(df$unemployment,
df$leftvote ,
pch = 19,
col = adjustcolor(viridis(3)[1], alpha = 0.8))
# 95% prediction band, East.
polygon(
x = c(unemp, rev(unemp)),
y = c(Y_quants_range_east[1 , ], rev(Y_quants_range_east[2 , ])),
col = adjustcolor(viridis(3)[2], alpha = 0.5),
border = NA
)
# 95% prediction band, West.
polygon(
x = c(unemp, rev(unemp)),
y = c(Y_quants_range_west[1 , ], rev(Y_quants_range_west[2 , ])),
col = adjustcolor(viridis(3)[3], alpha = 0.5),
border = NA
)
# Mean predicted values on top of the bands.
lines(unemp, Y_quants_range_east[3, ],
col = viridis(3)[2])
lines(unemp, Y_quants_range_west[3, ],
col = viridis(3)[3])
# Add a legend
legend("topleft",
lty = "solid",
col = viridis(3)[2:3],
legend = c("East", "West"),
cex = 0.8,
bty = "n")
# data frame for predicted values, West (ci_lo = 2.5%, ci_hi = 97.5%)
plot_pv_west <- data.frame(t(Y_quants_range_west))
names(plot_pv_west) <- c("ci_lo", "ci_hi", "mean")
plot_pv_west$unemp <- unemp
plot_pv_west$east <- 0
# data frame for predicted values, East
plot_pv_east <- data.frame(t(Y_quants_range_east))
names(plot_pv_east) <- c("ci_lo", "ci_hi", "mean")
plot_pv_east$unemp <- unemp
plot_pv_east$east <- 1
# combine data frames
plot_pv <- rbind(plot_pv_west, plot_pv_east)
# plot
# ggplot version: raw data as points, simulated means as solid lines and the
# 95% interval bounds as dashed lines, drawn separately for East and West.
ggplot(
data = df,
aes(x = unemployment, y = leftvote,
group = east)
) +
geom_point(
color = viridis(2, 0.5)[1]
) +
# add mean expected values
geom_line(data = plot_pv, aes(x = unemp,
y = mean,
group = east)) +
# add confidence intervals
geom_line(data = plot_pv, aes(x = unemp,
y = ci_lo,
group = east),
linetype = "dashed") +
geom_line(data = plot_pv, aes(x = unemp,
y = ci_hi,
group = east),
linetype = "dashed") +
theme_classic() +
labs(
x = "Unemployment in %",
y = "Predicted Voteshare (Die Linke) in %",
color = "",
title = "Left Voteshare and Unemployment"
) +
theme(legend.position = "none")
The confidence bounds are wider because we take the fundamental uncertainty of our model into account. To see this we can plot the expected values plot and predicted values plot side by side.
# Two plots side by side (1 row, 2 columns): EV left, PV right.
par(mfrow=c(1,2))
# Plot "Expected Voteshares" of Die Linke
plot(
unemp,
quants_range_east[2,],
pch = 19,
cex = 0.3,
bty = "n",
las = 1,
ylim = c(0, 45),
ylab = "Voteshare (%)",
main = "Expected Voteshare (Die Linke)",
xlab = "Range of Unemployment",
type = "n"
)
points(df$unemployment,
df$leftvote,
pch = 19,
col = adjustcolor(viridis(3)[1], alpha = 0.8))
# confidence bands (East, then West)
polygon(
x = c(unemp, rev(unemp)),
y = c(quants_range_east[1 ,], rev(quants_range_east[2 ,])),
col = adjustcolor(viridis(3)[2], alpha = 0.5),
border = NA
)
polygon(
x = c(unemp, rev(unemp)),
y = c(quants_range_west[1 ,], rev(quants_range_west[2 ,])),
col = adjustcolor(viridis(3)[3], alpha = 0.5),
border = NA
)
lines(unemp, quants_range_east[3,], col = viridis(3)[2]) # In this case I usually plot the polygons first and then the lines.
lines(unemp, quants_range_west[3,],
col = viridis(3)[3])
# Add a legend
legend("topleft",
lty = "solid",
col = viridis(3)[2:3],
legend = c("East", "West"),
cex = 0.8,
bty = "n")
# Plot "Predicted Voteshares" of Die Linke
plot(
unemp,
Y_quants_range_east[2, ],
las = 1,
bty = "n",
pch = 19,
cex = 0.3,
ylim = c(0, 45),
ylab = "Voteshare (%)",
main = "Predicted Voteshare (Die Linke)",
xlab = "Range of Unemployment",
type = "n"
)
points(df$unemployment,
df$leftvote ,
pch = 19,
col = adjustcolor(viridis(3)[1], alpha = 0.8))
# prediction bands (wider than the EV bands)
polygon(
x = c(unemp, rev(unemp)),
y = c(Y_quants_range_east[1 , ], rev(Y_quants_range_east[2 , ])),
col = adjustcolor(viridis(3)[2], alpha = 0.5),
border = NA
)
polygon(
x = c(unemp, rev(unemp)),
y = c(Y_quants_range_west[1 , ], rev(Y_quants_range_west[2 , ])),
col = adjustcolor(viridis(3)[3], alpha = 0.5),
border = NA
)
lines(unemp, Y_quants_range_east[3, ],
col = viridis(3)[2])
lines(unemp, Y_quants_range_west[3, ],
col = viridis(3)[3])
We’ll work with the simulation approach a lot and you will get more and more familiar with it as we apply it in various contexts.
Today, we introduce a new name for something we already did many times: Monte Carlo Simulation.
Steps for Monte Carlo Simulations:
To do this we need the MASS library (loaded in the setup
chunk).
Let’s start with an example you already know. What happens if we forget to include a variable in our model that is a confounder in the true data generating process?
*(Figure omitted in this text rendering; the original document shows a 250px causal-diagram image here.)*
We start with the means:
mus <- c(5, 10) # means of the two covariates
mus
## [1] 5 10
We also need standard deviations:
sds <- c(4, 5)
sds_mat <- diag(sds) # diagonal matrix with the standard deviations
sds_mat
## [,1] [,2]
## [1,] 4 0
## [2,] 0 5
Correlation matrix (we assume some multicollinearity). Why?
cor_mat <- matrix(c(1, 0.3, 0.3, 1), nrow = 2, ncol = 2)
cor_mat
## [,1] [,2]
## [1,] 1.0 0.3
## [2,] 0.3 1.0
Convert to variance covariance matrix
# Sigma = D %*% R %*% D, with D = diag(sds) and R the correlation matrix
varcov <- sds_mat %*% cor_mat %*% sds_mat
# step-by-step
sds_mat %*% cor_mat
## [,1] [,2]
## [1,] 4.0 1.2
## [2,] 1.5 5.0
sds_mat %*% cor_mat %*% sds_mat
## [,1] [,2]
## [1,] 16 6
## [2,] 6 25
#check sqrt of diag:
sqrt(diag(varcov)) #our initial sds
## [1] 4 5
Now we can generate X by taking draws from a multivariate normal.
# Draw 100000 covariate pairs from the multivariate normal.
X <- mvrnorm(n = 100000,
mu = mus,
Sigma = varcov)
head(X)
## [,1] [,2]
## [1,] 7.596385 1.181996
## [2,] -2.625701 7.948951
## [3,] 8.690780 15.689219
## [4,] 6.565599 18.779475
## [5,] 7.728005 12.368167
## [6,] 6.225189 9.820113
cor(X[, 1], X[, 2]) # check correlation (we specified it in cor_mat)
## [1] 0.3021148
# True coefficients of the data generating process.
b <- c(10, # intercept
3, # beta_1
5) # beta_2
mu <- cbind(1, X) %*% b # systematic component
# Choose sigma_est
sigma_est <- 1
# Generate Y
Y <- rnorm(100000,
mean = mu, # systematic component
sd = sigma_est) # stochastic component
# Create population data
# (cbind names the covariate columns V2 and V3)
pop <- data.frame(cbind(Y, X))
head(pop)
## Y V2 V3
## 1 38.46094 7.596385 1.181996
## 2 42.56915 -2.625701 7.948951
## 3 114.52981 8.690780 15.689219
## 4 123.16810 6.565599 18.779475
## 5 95.57617 7.728005 12.368167
## 6 78.17619 6.225189 9.820113
We can put steps 1 to 3 in one function.
# Generate a synthetic population from a linear DGP:
#   X ~ MVN(mus, varcov),  Y ~ N(cbind(1, X) %*% b, sigma_est)
#
# Args:
#   mus:       numeric vector of covariate means
#   varcov:    variance-covariance matrix of the covariates
#   b:         true coefficients (intercept first), length ncol(varcov) + 1
#   sigma_est: standard deviation of the stochastic component
#   n:         population size (default 100000, as in the original chunk)
#
# Returns: data.frame with columns Y, V2, V3, ... (one per covariate)
gen_pop <- function(mus, varcov, b, sigma_est, n = 100000){
  X <- mvrnorm(n = n,
               mu = mus,
               Sigma = varcov)
  mu_y <- cbind(1, X) %*% b  # systematic component
  Y <- rnorm(n,
             mean = mu_y,
             sd = sigma_est)  # stochastic component
  population <- data.frame(cbind(Y, X))
  return(population)
}
# See if our function works
# (same arguments as the step-by-step version above)
pop <- gen_pop(mus = mus,
varcov = varcov,
b = b,
sigma_est = sigma_est)
head(pop)
## Y V2 V3
## 1 -6.048382 6.615709 -7.066335
## 2 105.345943 10.900004 12.606267
## 3 65.766866 2.741620 9.387026
## 4 64.751920 9.340421 5.159298
## 5 134.964377 5.885883 21.374333
## 6 79.446339 6.501881 9.783187
Next, we need to:
and…
We do that in one function:
# Draw a random sample from the population and fit two regressions: one with
# both covariates, one omitting V3 (the confounder).
#
# Args:
#   pop:  population data.frame with columns Y, V2, V3
#   size: sample size drawn without replacement (default 500, as originally)
#
# Returns: numeric vector of length 2 with the beta_1 (V2) estimates:
#   [1] correctly specified model, [2] model omitting V3
drawncalc_ovb <- function(pop, size = 500){
  pop_sample <- pop[sample(nrow(pop), size = size), ] #dplyr::sample_n(pop, size)
  # regression without omitted variable bias
  reg_nobias <- lm(Y ~ V2 + V3, data = pop_sample)
  # regression with omitted variable bias
  reg_bias <- lm(Y ~ V2, data = pop_sample)
  # store and return beta_1 estimate of both models
  b1_hat <- c(summary(reg_nobias)$coefficients[2, 1],
              summary(reg_bias)$coefficients[2, 1])
  return(b1_hat)
}
# 1000 replications; each returns c(unbiased, biased) estimates, and t()
# arranges them as a 1000 x 2 matrix.
b_hats_ovb <- t(replicate(1000, drawncalc_ovb(pop = pop)))
head(b_hats_ovb) # 2 cols with 1000 regression coefficients
## [,1] [,2]
## [1,] 2.998042 5.050788
## [2,] 2.996611 5.109977
## [3,] 3.017213 5.060918
## [4,] 3.015060 4.275836
## [5,] 2.998681 4.973375
## [6,] 3.007050 4.855191
two_cols <- viridis(2)
two_cols_alpha <- viridis(2, 0.5) # same palette, 50% transparent fills
#We plot both the biased and unbiased estimates
plot(density(b_hats_ovb[, 1]),
xlim = c(2.5,6.5),
main = "Omitted Variable Bias",
col = two_cols[1],
bty = "n",
las = 1,
ylab = "")
polygon(density(b_hats_ovb[, 1]),
col = two_cols_alpha[1],
border = F)
lines(density(b_hats_ovb[, 2]),
col = two_cols[2])
polygon(density(b_hats_ovb[, 2]),
col = two_cols_alpha[2],
border = F)
# dashed vertical lines at the mean estimate of each model
abline(v = c(mean(b_hats_ovb[, 1]),
mean(b_hats_ovb[, 2])),
col = two_cols,
lty = 2)
Omitted variable bias can dramatically bias our estimates of interest.
# put data to long format (better practice in ggplot2)
# instead of 2 columns, make one with another column identifying the model
b_hats_plot <- data.frame(
estimate = c(b_hats_ovb[, 1], b_hats_ovb[, 2]),
Model = rep(c("Unbiased", "Biased"), each = nrow(b_hats_ovb))
)
ggplot(b_hats_plot,
aes(x = estimate, fill = Model, color = Model)
) +
geom_density() + # add two density plots
scale_color_viridis(discrete = T, direction = -1) + # change colors with viridis palette
scale_fill_viridis(discrete = T, alpha = 0.5, direction = -1) + # fill with viridis
labs(
title = "Omitted Variable Bias",
x = "",
y = "",
fill = "Model"
) +
# dashed equivalents of the base-R abline(): one line per model mean
geom_vline(
xintercept = apply(b_hats_ovb, 2, mean), # object with two columns
color = viridis(2)
) +
geom_hline( # remove the bottom line for cleaner look
yintercept = 0,
color = "white" # grid line color
)
Omitted variable bias can dramatically bias our estimates of interest.
We start with the means
mus <- c(5, 10) # covariate means (re-used from above)
We also need standard deviations
sds <- c(4, 5)
sds_mat <- diag(sds) # construct a diagonal matrix
sds_mat
## [,1] [,2]
## [1,] 4 0
## [2,] 0 5
Correlation matrix (for now we assume no multicollinearity).
cor_mat <- matrix(data = c(1, 0, 0, 1), nrow = 2, ncol = 2)
cor_mat
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
Convert to variance-covariance matrix:
varcov <- sds_mat %*% cor_mat %*% sds_mat
varcov
## [,1] [,2]
## [1,] 16 0
## [2,] 0 25
Now we can generate X
X <- mvrnorm(n = 100000,
mu = mus,
Sigma = varcov)
cor(X[, 1], X[, 2]) # check correlation
## [1] 0.001342222
b <- c(10, 3, 5) # true coefficients: intercept, beta_1, beta_2
mu <- cbind(1, X) %*% b # systematic component
# Choose sigma_est
sigma_est <- 1
# Generate Y
Y <- rnorm(100000,
mean = mu, # systematic component
sd = sigma_est) # stochastic component
Now we need to add some measurement error in X (e.g., in V2)
# The error is added AFTER Y was generated, so Y depends on the true V2 while
# we only observe the noisy version — this is what produces attenuation bias.
sigma_err <- 0.5
X[, 1] <- X[, 1] + rnorm(length(X[, 1]), 0, sigma_err) # X-star from the lecture
# Population data
pop <- data.frame(cbind(Y, X))
head(pop)
## Y V2 V3
## 1 75.0174427 7.942766 7.896190
## 2 104.2665025 8.091110 14.441986
## 3 85.2890589 2.425802 13.489344
## 4 54.2641485 -3.586130 10.781174
## 5 -0.9287311 -1.636571 -1.270940
## 6 54.2242427 11.522858 2.387464
We can modify our gen_pop function accordingly.
# Generate a population as before, then add classical measurement error to the
# first covariate: the observed V2 equals the true value plus N(0, sigma_err)
# noise. Y is generated from the TRUE covariates, so regressions on the
# returned data exhibit attenuation bias whenever sigma_err > 0.
#
# Args:
#   mus, varcov: mean vector and variance-covariance matrix of the covariates
#   b:           true coefficients, intercept first
#   sigma_est:   SD of the stochastic component of Y
#   sigma_err:   SD of the measurement error added to V2 (0 = none)
#   n:           population size (default 100000, as in the original)
#
# Returns: data.frame with columns Y, V2, V3, ...
gen_pop <- function(mus, varcov, b, sigma_est, sigma_err, n = 100000){
  X <- mvrnorm(n = n, mu = mus, Sigma = varcov)
  mu_y <- cbind(1, X) %*% b
  Y <- rnorm(n,
             mu_y,
             sigma_est)
  # Add measurement error in V2 (after Y, so Y reflects the true values)
  X[, 1] <- X[, 1] + rnorm(nrow(X), 0, sigma_err)
  population <- data.frame(cbind(Y, X))
  return(population)
}
Perfect, let’s set up some scenarios using our function.
# Four populations, identical except for the amount of measurement error in V2.
# a) no measurement error
pop1 <- gen_pop(mus = mus,
varcov = varcov,
b = b,
sigma_est = sigma_est,
sigma_err = 0)
# b) small measurement error
pop2 <- gen_pop(mus = mus,
varcov = varcov,
b = b,
sigma_est = sigma_est,
sigma_err = 0.25)
# c) high measurement error
pop3 <- gen_pop(mus = mus,
varcov = varcov,
b = b,
sigma_est = sigma_est,
sigma_err = 0.5)
# d) very high measurement error
pop4 <- gen_pop(mus = mus,
varcov = varcov,
b = b,
sigma_est = sigma_est,
sigma_err = 1)
Next, we need to:
and …
We do that in one function:
# Draw a random sample from the population, fit the correctly specified model,
# and return the beta_1 (V2) estimate together with its standard error.
#
# Args:
#   pop:  population data.frame with columns Y, V2, V3
#   size: sample size drawn without replacement (default 500, as originally)
#
# Returns: named numeric vector c(Estimate, `Std. Error`) for V2
drawncalc <- function(pop, size = 500){
  pop_sample <- pop[sample(nrow(pop), size = size), ]
  reg <- lm(Y ~ V2 + V3,
            data = pop_sample)
  # get coefficient and standard error
  b1_hat <- summary(reg)$coefficients[2, 1:2]
  return(b1_hat)
}
We do this for the different scenarios.
# 1000 replications per scenario; each b_hats_* is a 1000 x 2 matrix with
# columns Estimate and Std. Error.
# without measurement error
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))
# with small measurement error
b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))
# with high measurement error
b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))
# with very high measurement error
b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))
Plots help us here.
four_cols <- viridis(4)
four_cols_alpha <- viridis(4, 0.5) # same palette, 50% transparent for fills
# Side by side: sampling distribution of the point estimate (left, column 1)
# and of the standard error (right, column 2) for all four scenarios.
par(mfrow = c(1, 2))
# first plot
plot(density(b_hats_no[, 1]),
xlim = c(2.7, 3.05),
main = "Point Estimate",
bty = "n",
las = 1,
yaxt = "n",
ylab = "",
xlab = "",
type = "n")
polygon(density(b_hats_no[, 1]),
col = four_cols_alpha[1],
border = F)
polygon(density(b_hats_small[, 1]),
col = four_cols_alpha[2],
border = F)
polygon(density(b_hats_high[, 1]),
col = four_cols_alpha[3],
border = F)
polygon(density(b_hats_veryhigh[, 1]),
col = four_cols_alpha[4],
border = F)
lines(density(b_hats_no[, 1]),
col = four_cols[1])
lines(density(b_hats_small[, 1]),
col = four_cols[2])
lines(density(b_hats_high[, 1]),
col = four_cols[3])
lines(density(b_hats_veryhigh[, 1]),
col = four_cols[4])
# dashed vertical lines at the mean estimate of each scenario
abline(v = c(mean(b_hats_no[, 1]),
mean(b_hats_small[, 1]),
mean(b_hats_high[, 1]),
mean(b_hats_veryhigh[, 1])),
col = four_cols, lty = 2)
legend("topleft",
legend = c("No", "Small", "High", "Very high"),
col = four_cols,
lty = "dashed",
cex = 0.7,
bty = "n")
# second plot
plot(density(b_hats_no[, 2]),
xlim = c(0.009, 0.04),
main = "Standard Error",
bty = "n",
las = 1,
yaxt = "n",
ylab = "",
xlab = "",
type = "n")
polygon(density(b_hats_no[, 2]),
col = four_cols_alpha[1],
border = F)
polygon(density(b_hats_small[, 2]),
col = four_cols_alpha[2],
border = F)
polygon(density(b_hats_high[, 2]),
col = four_cols_alpha[3],
border = F)
polygon(density(b_hats_veryhigh[, 2]),
col = four_cols_alpha[4],
border = F)
lines(density(b_hats_no[, 2]),
col = four_cols[1])
lines(density(b_hats_small[, 2]),
col = four_cols[2])
lines(density(b_hats_high[, 2]),
col = four_cols[3])
lines(density(b_hats_veryhigh[, 2]),
col = four_cols[4])
abline(v = c(mean(b_hats_no[, 2]),
mean(b_hats_small[, 2]),
mean(b_hats_high[, 2]),
mean(b_hats_veryhigh[, 2])),
col = four_cols,
lty = 2)
What is the relationship between measurement error in V2 and the point estimate and the standard error?
# Mean point estimate against the amount of measurement error (sigma_err).
plot(x = c(0, 0.25, 0.5, 1),
y = c(mean(b_hats_no[, 1]),
mean(b_hats_small[, 1]),
mean(b_hats_high[, 1]),
mean(b_hats_veryhigh[, 1])),
pch = 19,
col = viridis(1, 0.8),
main = "Measurement Error and Point Estimate",
xlab = "Measurement Error",
ylab = "Point Estimate",
bty = "n",
las = 1)
# Mean standard error against the amount of measurement error.
plot(x = c(0, 0.25, 0.5, 1),
y = c(mean(b_hats_no[, 2]),
mean(b_hats_small[, 2]),
mean(b_hats_high[, 2]),
mean(b_hats_veryhigh[, 2])),
pch = 19,
col = viridis(1, 0.8),
main = "Measurement Error and Standard Error",
xlab = "Measurement Error",
ylab = "Standard Error",
bty = "n",
las = 1)
This attenuation bias, which is introduced through measurement error in X, biases our coefficients towards zero.
In the exercise section, you will see how measurement error in Y affects our coefficients.
# Stack the four scenario matrices; columns inherit the names "Estimate" and
# "Std. Error" from the drawncalc() output.
b_hats <- as.data.frame(rbind(b_hats_no,
b_hats_small,
b_hats_high,
b_hats_veryhigh))
# Scenario label, ordered factor so the legend is No < Small < High < Very High.
b_hats$error <-
factor(rep(c("No", "Small", "High", "Very High"), each = nrow(b_hats) / 4),
levels = c("No", "Small", "High", "Very High"))
# first plot
p1 <- ggplot(data = b_hats, aes(x = Estimate, fill = error, color = error)) +
labs(
title = "Point Estimate",
x = "",
y = "",
fill = "Measurement Error",
color = "Measurement Error"
) +
geom_density() + # add densities
geom_vline( # manually add lines with means
xintercept = b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise(mean = mean(Estimate)) %>% # take mean of Estimate column
pull(mean), # pull the mean column
linetype = "dashed",
color = viridis(4)
) +
scale_fill_viridis(discrete = T, alpha = 0.5) +
scale_color_viridis(discrete = T)
# second plot
p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = error, color= error)) +
labs(
title = "Standard Error",
x = "",
y = "",
fill = "Measurement Error",
color = "Measurement Error"
) +
geom_density() +
geom_vline( # manually add lines with means
xintercept = b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise(mean = mean(`Std. Error`)) %>% # take mean of `Std. Error` column
pull(mean), # pull the mean column
linetype = "dashed",
color = viridis(4)
) +
scale_fill_viridis(discrete = T, alpha = 0.5) +
scale_color_viridis(discrete = T)
# put plots together with patchwork package
p1 + p2 + plot_layout(guides = "collect") &
geom_hline( # remove the bottom line for cleaner look
yintercept = 0,
color = "white" # grid line color
)
What is the relationship between measurement error in V2 and the point estimate and the standard error?
# ggplot version of the scatter plots: scenario means against the amount of
# measurement error c(0, 0.25, 0.5, 1).
p1 <- b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise_all(mean) %>% # summarize all variables
ggplot() +
geom_point(
aes(x = c(0, 0.25, 0.5, 1),
y = Estimate)
) +
labs(
x = "Measurement Error",
y = "Point Estimate"
) +
theme_classic() # restore to classic theme with no modifications
p2 <- b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise_all(mean) %>%
ggplot() +
geom_point(
aes(x = c(0, 0.25, 0.5, 1),
y = `Std. Error`)
) +
labs(
x = "Measurement Error",
y = "Standard Error"
) +
theme_classic()
p1 + p2 + plot_annotation(title = "The Effect of Measurement Error in X on Estimates")
This attenuation bias, which is introduced through measurement error in X, biases our coefficients towards zero.
In the exercise section, you will see how measurement error in Y affects our coefficients.
Steps 1 to 3: Again, we only need to modify our gen_pop
function from above. What changes?
# Generate a population from the linear DGP (re-defined here without the
# measurement-error step): X ~ MVN(mus, varcov), Y ~ N(cbind(1, X) %*% b, sigma_est).
# The varcov argument now controls the degree of multicollinearity.
#
# Args:
#   mus:       covariate means
#   varcov:    covariate variance-covariance matrix
#   b:         true coefficients, intercept first
#   sigma_est: SD of the stochastic component
#   n:         population size (default 100000, as in the original)
#
# Returns: data.frame with columns Y, V2, V3, ...
gen_pop <- function(mus, varcov, b, sigma_est, n = 100000){
  # Generate IV
  X <- mvrnorm(n = n,
               mu = mus,
               Sigma = varcov)
  mu_y <- cbind(1, X) %*% b
  Y <- rnorm(n,
             mean = mu_y,
             sd = sigma_est)
  population <- data.frame(cbind(Y, X))
  return(population)
}
Let’s set up some scenarios:
# Four populations, identical except for the covariate correlation (0 to 0.9).
# a) No multicollinearity
cor_mat1 <- matrix(c(1, 0, 0, 1),
nrow = 2,
ncol = 2)
varcov1 <- sds_mat %*% cor_mat1 %*% sds_mat
pop1 <- gen_pop(mus = mus,
varcov = varcov1,
b = b,
sigma_est = sigma_est)
# b) Small multicollinearity
cor_mat2 <- matrix(c(1, 0.3, 0.3, 1),
nrow = 2,
ncol = 2)
varcov2 <- sds_mat %*% cor_mat2 %*% sds_mat
pop2 <- gen_pop(mus = mus,
varcov = varcov2,
b = b,
sigma_est = sigma_est)
# c) High multicollinearity
cor_mat3 <- matrix(c(1, 0.6, 0.6, 1), nrow = 2, ncol = 2)
varcov3 <- sds_mat %*% cor_mat3 %*% sds_mat
pop3 <- gen_pop(mus = mus,
varcov = varcov3,
b = b,
sigma_est = sigma_est)
# d) Very high multicollinearity
cor_mat4 <- matrix(c(1, 0.9, 0.9, 1),
nrow = 2,
ncol = 2)
varcov4 <- sds_mat %*% cor_mat4 %*% sds_mat
pop4 <- gen_pop(mus = mus,
varcov = varcov4,
b = b,
sigma_est = sigma_est)
Steps 4 to 6 are the same as above. (And we can use the same function.)
# 1000 replications per multicollinearity scenario (1000 x 2 matrices:
# Estimate and Std. Error of beta_1).
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))
b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))
b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))
b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))
# Side by side: sampling distribution of the point estimate (left) and of the
# standard error (right) for the four multicollinearity scenarios.
par(mfrow = c(1, 2))
# first plot
plot(density(b_hats_no[, 1]),
xlim = c(2.9, 3.1),
main = "Point Estimate",
bty = "n",
las = 1,
yaxt = "n",
ylab = "",
xlab = "",
type = "n")
polygon(density(b_hats_no[, 1]),
col = four_cols_alpha[1],
border = F)
polygon(density(b_hats_small[, 1]),
col = four_cols_alpha[2],
border = F)
polygon(density(b_hats_high[, 1]),
col = four_cols_alpha[3],
border = F)
polygon(density(b_hats_veryhigh[, 1]),
col = four_cols_alpha[4],
border = F)
lines(density(b_hats_no[, 1]),
col = four_cols[1])
lines(density(b_hats_small[, 1]),
col = four_cols[2])
lines(density(b_hats_high[, 1]),
col = four_cols[3])
lines(density(b_hats_veryhigh[, 1]),
col = four_cols[4])
# dashed vertical lines at the mean estimate of each scenario
abline(v = c(mean(b_hats_no[, 1]),
mean(b_hats_small[, 1]),
mean(b_hats_high[, 1]),
mean(b_hats_veryhigh[, 1])),
col = four_cols, lty = 2)
legend("topleft",
legend = c("No", "Small", "High", "Very high"),
col = four_cols,
lty = "dashed",
cex = 0.7,
bty = "n")
# second plot
plot(density(b_hats_no[, 2]),
xlim = c(0.009, 0.031),
main = "Standard Error",
bty = "n",
las = 1,
yaxt = "n",
ylab = "",
xlab = "",
type = "n")
polygon(density(b_hats_no[, 2]),
col = four_cols_alpha[1],
border = F)
polygon(density(b_hats_small[, 2]),
col = four_cols_alpha[2],
border = F)
polygon(density(b_hats_high[, 2]),
col = four_cols_alpha[3],
border = F)
polygon(density(b_hats_veryhigh[, 2]),
col = four_cols_alpha[4],
border = F)
lines(density(b_hats_no[, 2]),
col = four_cols[1])
lines(density(b_hats_small[, 2]),
col = four_cols[2])
lines(density(b_hats_high[, 2]),
col = four_cols[3])
lines(density(b_hats_veryhigh[, 2]),
col = four_cols[4])
abline(v = c(mean(b_hats_no[, 2]),
mean(b_hats_small[, 2]),
mean(b_hats_high[, 2]),
mean(b_hats_veryhigh[, 2])),
col = four_cols,
lty = 2)
What is the relationship between multicollinearity and the standard error?
# Plot the average simulated standard error against the degree of
# correlation between the two predictors.
plot(x = c(0, 0.3, 0.6, 0.9),
     y = vapply(list(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh),
                function(m) mean(m[, 2]),
                numeric(1)),
     main = "Multicollinearity and Standard Error",
     xlab = "Correlation",
     ylab = "Standard Error",
     pch = 19,
     las = 1,
     col = viridis(1, 0.75),
     bty = "n")
Multicollinearity increases the sampling variance of the OLS coefficients. This in turn increases the SEs and CIs of the coefficients. The good news is that even strong multicollinearity does not bias our coefficients.
# Stack the four simulation runs into one data frame and label each run
# with its multicollinearity scenario (each run contributes a quarter of
# the rows, in the order they were bound).
b_hats <- as.data.frame(
  rbind(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)
)
b_hats$mc <- factor(
  rep(c("No", "Small", "High", "Very High"), each = nrow(b_hats) / 4),
  levels = c("No", "Small", "High", "Very High")
)
# first plot
p1 <- ggplot(data = b_hats, aes(x = Estimate, fill = mc, color = mc)) +
labs(
title = "Point Estimate",
x = "",
y = "",
fill = "Multicollinearity",
color = "Multicollinearity"
) +
geom_density() + # add densities
geom_vline( # manually add lines with means
xintercept = b_hats %>% # take b_hats
group_by(mc) %>% # apply all calculations within groups of mc
summarise(mean = mean(Estimate)) %>% # take mean of Estimate column
pull(mean), # pull the mean column
linetype = "dashed",
color = viridis(4)
) +
scale_fill_viridis(discrete = T, alpha = 0.5) +
scale_color_viridis(discrete = T)
# second plot: densities of the standard errors by multicollinearity level
p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = mc, color = mc)) +
  labs(
    title = "Standard Error",
    x = "",
    y = "",
    fill = "Multicollinearity",
    color = "Multicollinearity"
  ) +
  geom_density() + # add densities
  geom_vline( # dashed vertical lines at the group means
    xintercept = b_hats %>% # take b_hats
      group_by(mc) %>% # all calculations within groups of mc
      summarise(mean = mean(`Std. Error`)) %>% # mean SE per group
      pull(mean), # extract the mean column as a vector
    linetype = "dashed",
    color = viridis(4) # one color per factor level, in level order
  ) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.5) + # TRUE, not T
  scale_color_viridis(discrete = TRUE)
# Combine the two density panels; plot_layout(guides = "collect") merges
# their identical legends into one. The patchwork & operator applies the
# following layer to BOTH panels.
p1 + p2 + plot_layout(guides = "collect") &
geom_hline( # white line at y = 0 hides the bottom density edge
yintercept = 0,
color = "white" # matches the plot background
)
What is the relationship between multicollinearity and the standard error?
# Average the simulated estimates and standard errors within each
# multicollinearity scenario, then plot the mean SE against the
# correlation used in that scenario.
b_hats %>% # take b_hats
  group_by(mc) %>% # all calculations within groups of mc
  summarise(across(everything(), mean)) %>% # summarise_all() is superseded
  ggplot() +
  geom_point(aes(x = c(0, 0.3, 0.6, 0.9), # the four correlations used above
                 y = `Std. Error`)) +
  labs(x = "Multicollinearity",
       title = "Multicollinearity and Standard Error",
       y = "Standard Error") +
  theme_classic() +
  scale_x_continuous(breaks = c(0, 0.3, 0.6, 0.9))
Multicollinearity increases the sampling variance of the OLS coefficients. This in turn increases the SEs and CIs of the coefficients. The good news is that even strong multicollinearity does not bias our coefficients.
# Generate some data
# x: 100 draws, uniform on [0.2, 1]; e: 100 i.i.d. N(0, sd = 0.5) errors.
# NOTE(review): kept verbatim — the order of RNG calls fixes the draws.
x <- runif(100, 0.2, 1)
e <- rnorm(100, 0, 0.5)
y1 <- 2 * x + e # This creates homoscedastic data.
If the variance of the error term is correlated with \(x\), we get heteroscedastic data.
# Heteroskedastic outcome: the error term is scaled by x^2, so its
# spread grows with x.
y2 <- 2 * x + e * x^2

# Show both scatterplots side by side.
par(mfrow = c(1, 2))

# Homoscedastic data
plot(x = x, y = y1,
     main = "Homoskedastic data",
     pch = 19,
     las = 1,
     bty = "n",
     col = viridis(1, 0.75)[1])

# Heteroskedastic data
plot(x = x, y = y2,
     main = "Heteroskedastic data",
     pch = 19,
     las = 1,
     bty = "n",
     col = viridis(1, 0.75)[1])
# ggplot versions of the two scatterplots.
# Homoscedastic data
p_hom <- ggplot() +
  theme_classic() +
  labs(title = "Homoskedastic data",
       x = "x",
       y = "y1") +
  geom_point(aes(x = x, y = y1),
             color = viridis(1, 0.75)[1],
             size = 2)

# Heteroskedastic data
p_het <- ggplot() +
  theme_classic() +
  labs(title = "Heteroskedastic data",
       x = "x",
       y = "y2") +
  geom_point(aes(x = x, y = y2),
             color = viridis(1, 0.75)[1],
             size = 2)

# patchwork: place the two panels next to each other
p_hom + p_het
# Fit OLS to the homoscedastic and to the heteroskedastic outcome.
reg_hom <- lm(y1 ~ x)
reg_het <- lm(y2 ~ x)
Have a look at the residuals.
# Residuals vs. fitted values for both models, side by side. Under
# homoscedasticity the residuals scatter evenly around zero; under
# heteroscedasticity their spread changes with the fitted values.
par(mfrow = c(1, 2))

plot(x = fitted.values(reg_hom),
     y = residuals(reg_hom),
     main = "Homoscedasticity",
     xlab = "Fitted Values",
     ylab = "Residuals",
     ylim = c(-1.5, 1),
     pch = 19,
     las = 1,
     bty = "n",
     col = viridis(1, 0.75))
abline(h = 0) # reference line at zero

plot(x = fitted.values(reg_het),
     y = residuals(reg_het),
     main = "Heteroscedasticity",
     xlab = "Fitted Values",
     ylab = "Residuals",
     ylim = c(-1.5, 1),
     pch = 19,
     las = 1,
     bty = "n",
     col = viridis(1, 0.75))
abline(h = 0) # reference line at zero
# Residuals against x for the homoscedastic model (ggplot version).
p1 <- ggplot(data = reg_hom$model,
             aes(x = x, y = residuals(reg_hom))) +
  geom_point(color = viridis(1, 0.75), # add points
             size = 2) +
  theme_classic() + # change the appearance
  labs(x = "X",
       y = "Residuals",
       title = "Homoscedasticity") +
  geom_hline(yintercept = 0, # reference line at zero
             linewidth = 0.5) + # `size` for lines is deprecated (ggplot2 3.4.0)
  scale_y_continuous(limits = c(-1.5, 2))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Residuals against x for the heteroskedastic model.
# BUG FIX: `data` previously pointed at reg_hom$model while the residuals
# came from reg_het. Both models happen to share the same x here, so the
# plot looked right, but the mismatch breaks as soon as the two models
# are fit on different data.
p2 <- ggplot(data = reg_het$model,
             aes(x = x, y = residuals(reg_het))) +
  geom_point(color = viridis(1, 0.75), # add points
             size = 2) +
  theme_classic() + # change the appearance
  labs(x = "X",
       y = "Residuals",
       title = "Heteroscedasticity") +
  geom_hline(yintercept = 0, # reference line at zero
             linewidth = 0.5) + # `size` for lines is deprecated (ggplot2 3.4.0)
  scale_y_continuous(limits = c(-1.5, 2))
p1 + p2
We can compute heteroskedasticity-robust standard errors with the sandwich package.
data <- as.data.frame(cbind(y2, x))
# OLS on the heteroskedastic outcome; summary() reports the classical
# (non-robust) standard errors for comparison with the robust ones below.
reg_ols <- lm(y2 ~ x,
data = data)
summary(reg_ols)
##
## Call:
## lm(formula = y2 ~ x, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64178 -0.08214 -0.02146 0.06237 0.92234
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.05062 0.05373 0.942 0.349
## x 1.91473 0.09009 21.254 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2097 on 98 degrees of freedom
## Multiple R-squared: 0.8217, Adjusted R-squared: 0.8199
## F-statistic: 451.7 on 1 and 98 DF, p-value: < 2.2e-16
# Heteroskedasticity-consistent variance-covariance matrix from the
# sandwich package. NOTE(review): presumably uses vcovHC's default HC
# type — confirm against the sandwich documentation.
vcovHC(reg_ols)
## (Intercept) x
## (Intercept) 0.002200636 -0.005187228
## x -0.005187228 0.013099920
# Robust standard errors are the square roots of the diagonal.
sqrt(diag(vcovHC(reg_ols)))
## (Intercept) x
## 0.04691094 0.11445488
# Coefficient tests with robust SEs (lmtest::coeftest + sandwich::vcovHC).
coeftest(reg_ols, vcov. = vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.050618 0.046911 1.079 0.2832
## x 1.914733 0.114455 16.729 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
However, differences between robust and normal standard errors often reveal problems with your model specification.
See for example: King and Roberts 2015: How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It.
In AQM next semester you will learn how to model heteroscedastic regressions.
# Some fake data
# 99 well-behaved points plus one extreme observation at (x = 5, y = 20).
# NOTE(review): kept verbatim — the order of RNG calls fixes the draws.
x <- c(rnorm(99, 0, 1), 5)
y <- c(rnorm(99, 1, 1), 20)
data <- data.frame(cbind(x, y))
m1 <- lm(y ~ x, data = data)
summary(m1)
##
## Call:
## lm(formula = y ~ x, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1556 -0.9727 -0.0238 0.8218 14.4951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2457 0.1911 6.520 3.07e-09 ***
## x 0.8519 0.1740 4.896 3.85e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.91 on 98 degrees of freedom
## Multiple R-squared: 0.1966, Adjusted R-squared: 0.1884
## F-statistic: 23.97 on 1 and 98 DF, p-value: 3.847e-06
# Scatterplot with the fitted regression line.
plot(x, y,
     pch = 19,
     col = two_cols_alpha[1],
     bty = "n",
     las = 1)
# Reuse the already-fitted model instead of re-estimating lm(y ~ x)
# a second time just to draw the line.
abline(m1, col = two_cols[1])
# Same plot with ggplot2: points plus the fitted line from m1.
ggplot() +
  geom_point(aes(x = x, y = y),
             color = viridis(1, alpha = 0.5), # add points
             size = 2) +
  geom_abline(intercept = m1$coefficients[1], # add the fitted line
              slope = m1$coefficients[2],
              color = viridis(1),
              linewidth = 1) + # `size` for lines is deprecated (ggplot2 3.4.0)
  theme_classic()
# Cook's distance measures how strongly each observation influences the
# fitted coefficients; the planted outlier should stand out clearly.
D <- cooks.distance(m1)
plot(D,
     main = "Cook's Distance",
     pch = 19,
     las = 1,
     bty = "n",
     col = two_cols_alpha[1])
Drop observations (here it is just the one observation).
s <- D < 1
Run the regression without the outlier.
# Re-fit the regression on the subset without the influential observation.
m2 <- lm(y ~ x,
data = data[s,])
summary(m2)
##
## Call:
## lm(formula = y ~ x, data = data[s, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.22717 -0.72367 0.04444 0.67031 2.45732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.06721 0.09662 11.046 <2e-16 ***
## x 0.08596 0.09832 0.874 0.384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9605 on 97 degrees of freedom
## Multiple R-squared: 0.007818, Adjusted R-squared: -0.002411
## F-statistic: 0.7643 on 1 and 97 DF, p-value: 0.3841
Let’s compare the two models.
# Compare the fitted lines with and without the outlier, on identical axes.
par(mfrow = c(1, 2))

# Left panel: full sample, regression includes the outlier.
plot(x = x,
     y = y,
     main = "Model with Outlier",
     xlim = c(-3, 5),
     ylim = c(-1.5, 20),
     pch = 19,
     las = 1,
     bty = "n",
     col = two_cols_alpha[1])
abline(m1, col = two_cols[1])

# Right panel: outlier removed before fitting.
plot(x = x[s],
     y = y[s],
     main = "Model without Outlier",
     xlim = c(-3, 5),
     ylim = c(-1.5, 20),
     pch = 19,
     las = 1,
     bty = "n",
     col = two_cols_alpha[1])
abline(m2, col = two_cols[1])
Outliers can heavily influence our estimates. Usually, we do not want the results of regression analyses to depend on few influential outliers.
# ggplot comparison of the two fitted lines.
# First the model without excluding the outlier
p1 <- ggplot() +
  geom_point(aes(x = x, y = y),
             color = viridis(1, alpha = 0.5), # add points
             size = 2) +
  geom_abline(intercept = m1$coefficients[1], # add the fitted line
              slope = m1$coefficients[2],
              color = viridis(1),
              linewidth = 1) + # `size` for lines is deprecated (ggplot2 3.4.0)
  theme_classic() +
  labs(title = "Model with outlier")
# Second the model excluding the outlier (all points are still shown;
# only the line comes from the outlier-free fit m2)
p2 <- ggplot() +
  geom_point(aes(x = x, y = y),
             color = viridis(1, alpha = 0.5), # add points
             size = 2) +
  geom_abline(intercept = m2$coefficients[1], # add the fitted line
              slope = m2$coefficients[2],
              color = viridis(1),
              linewidth = 1) + # `size` for lines is deprecated (ggplot2 3.4.0)
  theme_classic() +
  labs(title = "Model without outlier")
p1 + p2
Outliers can heavily influence our estimates. Usually, we do not want the results of regression analyses to depend on few influential outliers.
What happens if we have measurement error in the dependent variable Y? Run a Monte Carlo simulation for measurement error in Y (without measurement error in V2 or V3).
For this, you will have to adjust the gen_pop function. Note: To add measurement error in Y, you need to add only one line of code.
gen_pop <- function(mus, varcov, b, sigma_est, sigma_err){
  # Simulate a population of 100,000 observations from a linear model.
  #
  # mus:       vector of means for the predictors
  # varcov:    variance-covariance matrix of the predictors
  # b:         true coefficient vector (intercept first)
  # sigma_est: sd of the structural error term
  # sigma_err: sd of the measurement error added to the observed Y
  #            (0 = Y measured without error)
  #
  # Returns a data.frame with Y in the first column, predictors after it.

  # Generate IV
  X <- mvrnorm(n = 100000,
               mu = mus,
               Sigma = varcov)
  # Systematic component: X (with a leading 1 for the intercept) times b.
  mu_y <- cbind(1, X) %*% b
  Y <- rnorm(100000,
             mean = mu_y,
             sd = sigma_est)
  # Add measurement error in Y.
  # FIX: sigma_err was previously accepted but never used, so all
  # "measurement error" scenarios produced identical populations.
  # rnorm with sd = 0 returns exactly 0, so sigma_err = 0 leaves Y unchanged.
  Y <- Y + rnorm(100000, mean = 0, sd = sigma_err)
  population <- data.frame(cbind(Y, X))
  return(population)
}
Now run the following code to see how different levels of measurement error in Y affect our estimates:
We assume no multicollinearity
# Identity correlation matrix: no multicollinearity between predictors.
cor_mat1 <- diag(2)
varcov1 <- sds_mat %*% cor_mat1 %*% sds_mat

# Four populations, identical except for the sd of the measurement
# error added to Y.
# No measurement error
pop1 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = 0)
# Small measurement error
pop2 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = 0.25)
# High measurement error
pop3 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = 0.5)
# Very high measurement error
pop4 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = 1)
Simulate n times
# Draw 1000 samples from each population and re-estimate; rows hold
# (Estimate, Std. Error) pairs after t().
# NOTE(review): kept verbatim — the order of these replicate() calls fixes
# the RNG stream, so restructuring would change the simulated draws.
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))
b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))
b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))
b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))
# Sampling distributions under the four measurement-error scenarios.
par(mfrow = c(1, 2))

# Collect the four runs so each plot layer is a short loop instead of
# four near-identical calls.
b_hat_list <- list(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)

# first plot: point estimates (column 1)
plot(density(b_hats_no[, 1]),
     xlim = c(2.7, 3.05),
     main = "Point Estimate",
     bty = "n",
     las = 1,
     yaxt = "n",
     ylab = "",
     xlab = "",
     type = "n") # empty canvas; layers are added below
# draw all fills first, then all outlines, so outlines stay on top
for (i in seq_along(b_hat_list)) {
  polygon(density(b_hat_list[[i]][, 1]),
          col = four_cols_alpha[i],
          border = FALSE) # FALSE, not F: T/F are reassignable
}
for (i in seq_along(b_hat_list)) {
  lines(density(b_hat_list[[i]][, 1]), col = four_cols[i])
}
# dashed vertical lines at each scenario's mean estimate
abline(v = vapply(b_hat_list, function(m) mean(m[, 1]), numeric(1)),
       col = four_cols, lty = 2)
legend("topleft",
       legend = c("No", "Small", "High", "Very high"),
       col = four_cols,
       lty = "dashed",
       cex = 0.7,
       bty = "n")
# second plot: standard errors (column 2), same layering scheme as the
# point-estimate panel.
se_list <- list(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)
plot(density(b_hats_no[, 2]),
     xlim = c(0.009, 0.04),
     main = "Standard Error",
     bty = "n",
     las = 1,
     yaxt = "n",
     ylab = "",
     xlab = "",
     type = "n") # empty canvas; layers are added below
# fills first, then outlines, so outlines stay on top
for (i in seq_along(se_list)) {
  polygon(density(se_list[[i]][, 2]),
          col = four_cols_alpha[i],
          border = FALSE) # FALSE, not F: T/F are reassignable
}
for (i in seq_along(se_list)) {
  lines(density(se_list[[i]][, 2]), col = four_cols[i])
}
# dashed vertical lines at each scenario's mean standard error
abline(v = vapply(se_list, function(m) mean(m[, 2]), numeric(1)),
       col = four_cols,
       lty = 2)
What do you observe?
Because the measurement error is uncorrelated with X, the OLS estimators are unbiased, but the variance is inflated (larger variances, larger standard errors).
# Stack the four simulation runs into one data frame and label each run
# with its measurement-error scenario (each run contributes a quarter of
# the rows, in the order they were bound).
b_hats <- as.data.frame(
  rbind(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)
)
b_hats$error <- factor(
  rep(c("No", "Small", "High", "Very High"), each = nrow(b_hats) / 4),
  levels = c("No", "Small", "High", "Very High")
)
# first plot
p1 <- ggplot(data = b_hats, aes(x = Estimate, fill = error, color = error)) +
labs(
title = "Point Estimate",
x = "",
y = "",
fill = "Measurement Error",
color = "Measurement Error"
) +
geom_density() + # add densities
geom_vline( # manually add lines with means
xintercept = b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise(mean = mean(Estimate)) %>% # take mean of Estimate column
pull(mean), # pull the mean column
linetype = "dashed",
color = viridis(4)
) +
scale_fill_viridis(discrete = T, alpha = 0.5) +
scale_color_viridis(discrete = T) +
theme(legend.title = element_text()) +
geom_hline( # remove the bottom line for cleaner look
yintercept = 0,
color = "white" # grid line color
)
# second plot
p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = error, color= error)) +
labs(
title = "Standard Error",
x = "",
y = "",
fill = "Measurement Error",
color = "Measurement Error"
) +
geom_density() +
geom_vline( # manually add lines with means
xintercept = b_hats %>% # take b_hats
group_by(error) %>% # apply all calculations within groups of error
summarise(mean = mean(`Std. Error`)) %>% # take mean of `Std. Error` column
pull(mean), # pull the mean column
linetype = "dashed",
color = viridis(4)
) +
scale_fill_viridis(discrete = T, alpha = 0.5) +
scale_color_viridis(discrete = T) +
theme(legend.title = element_text()) +
geom_hline( # remove the bottom line for cleaner look
yintercept = 0,
color = "white" # grid line color
)
# put plots together with patchwork package
# plot_layout(guides = "collect") merges the two identical legends;
# plot_annotation() adds an overall title above both panels.
p1 + p2 + plot_layout(guides = "collect") +
plot_annotation(title = "The Effect of Measurement Error in Y on Estimates")
What do you observe?
Because the measurement error is uncorrelated with X, the OLS estimators are unbiased, but the variance is inflated (larger variances, larger standard errors).